Example #1
class Game(object):
    def __init__(self):
        self.args = args = agent.parse_args()
        self.ep = EnvPool(args.env, self.args.env_size)
        self.eps = [
            MultiStageEpsilon([
                LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
                LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6))
            ]), 0
        ]
        self.replay = ReplayBuffer(args.replay_buffer_size)
        main_logger.info("Replay Buffer Max Size: {}B".format(
            pretty_num(args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
        self.sess = agent.make_session()
        self.sess.__enter__()
        agent.setup(self.ep.action_num, self.replay)
        self.train_epi = 0
        self.max_reward = agent.score

    def random(self):
        random_step = self.args.replay_buffer_size // 2
        obs = self.ep.reset()
        with tqdm(total=random_step, desc="random", ascii=True) as t:
            while t.n < random_step:
                action, (obs_, reward, done, info) = self.ep.random()
                for i in range(self.ep.size):
                    self.replay.add(obs[i], action[i], reward[i],
                                    float(done[i]), obs_[i])
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards']) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Random')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        if not self.max_reward:
            self.max_reward = mean_reward

    def train(self):
        train_step = 250000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=train_step, desc="Train", ascii=True) as t:
            while t.n < train_step:
                action = agent.take_action(
                    obs, self.eps[0].get(self.train_epi * train_step + t.n))
                obs_, reward, done, info = self.ep.step(action)
                for i in range(self.ep.size):
                    self.replay.add(obs[i], action[i], reward[i],
                                    float(done[i]), obs_[i])
                obs, info = self.ep.auto_reset()
                if t.n % self.args.target_update_freq == 0:
                    agent.update_target()
                if t.n % self.args.learning_freq == 0:
                    agent.train(self.ep.size)
                t.update(self.ep.size)
        self.train_epi += 1
        completion = np.round(100 * self.train_epi / self.args.num_iters, 2)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards'][-100:]) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Train')
        record.add_key_value('% Completion', completion)
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value(
            '% Exploration',
            np.round(self.eps[0].get(self.train_epi * train_step) * 100, 2))
        record.add_key_value('Reward (100 epi mean)', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())

    def test(self):
        test_step = 200000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=test_step, desc="Evaluation", ascii=True) as t:
            while t.n < test_step:
                action = agent.take_action(obs, self.eps[1])
                self.ep.step(action)
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards']) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Evaluation')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        if self.max_reward < mean_reward:
            self.max_reward = mean_reward
            agent.score = mean_reward
            agent.save_model()

    def run(self):
        self.random()
        for i in range(self.args.num_iters):
            self.train()
            self.test()
        self.exit()

    def exit(self):
        self.ep.close()
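
The epsilon schedule built in Example #1's constructor chains two linear anneals: 1.0 to 0.1 over the first 1e6 steps, then 0.1 to 0.05 over the following 9e6 steps. LinearAnnealEpsilon and MultiStageEpsilon themselves are not shown; the sketch below is only a plausible reading of that interface (a get(t) method indexed by the global step), not the project's actual classes.

# Hypothetical sketch of the epsilon-schedule classes used in Example #1.
class LinearAnnealEpsilon:
    """Linearly anneals epsilon from `start` to `end` over `steps` steps."""

    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps

    def get(self, t):
        frac = min(max(t / self.steps, 0.0), 1.0)
        return self.start + frac * (self.end - self.start)


class MultiStageEpsilon:
    """Chains stages; each stage consumes its own `steps` of the global counter."""

    def __init__(self, stages):
        self.stages = stages

    def get(self, t):
        for stage in self.stages:
            if t < stage.steps:
                return stage.get(t)
            t -= stage.steps
        return self.stages[-1].end  # schedule exhausted: hold the final value
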
Example #2
class TrainDQN:
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.
        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            max_episode_len: Maximum length of an individual episode
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_fraction: Fraction of max_steps update the target network
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()

        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        tf.summary.scalar('Loss', self.q_network.loss, )
        tf.summary.scalar('Mean Estimated Value', tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # exponential epsilon decay, from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                #         print("Episode Length:", ep_len)
                #         print(f"Episode {ep} Reward:{total_reward}")
                #         print(f"Random Action Percent: {rand_actions/ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq:])

                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")

                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag=f'Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)

                    # Model saving inspired by Open AI Baseline implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.
        Args:
            observation: observation from the environment
        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph: np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1

            # Periodically copy the Q network parameters into the target network
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')

            # Sample random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample

            # Calculate discounted predictions for the subsequent states using target network
            next_state_pred = self.gamma * self.sess.run(self.target_network.output_pred,
                                                         feed_dict={
                                                             self.target_network.input_ph: next_states}, )

            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')

            # Update discount factor and train model on batch
            _, loss = self.sess.run([self.q_network.opt, self.q_network.loss],
                                    feed_dict={self.q_network.input_ph: states,
                                               self.q_network.target_ph: targets.flatten(),
                                               self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.
        Args:
            path: Location to save the rewards plot. If None, image will be displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
            plt.close('all')
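
A usage sketch for TrainDQN, assuming a TF1-style session, a standard Gym environment, and that QNetworkBuilder/ReplayBuffer are importable as in the original project; the entry point below is illustrative, not the repository's actual script.

# Hypothetical driver for Example #2's TrainDQN.
import gym
import tensorflow as tf

env = gym.make('CartPole-v1')
with tf.Session() as sess:
    trainer = TrainDQN(env, sess,
                       max_steps=50000,
                       print_freq=20,
                       save_path='checkpoints/cartpole')
    sess.run(tf.global_variables_initializer())
    trainer.learn()
    trainer.plot_rewards('rewards.png')
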
Example #3
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get targets
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target(next_states), dim=1, keepdim=True)[0]

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # get outputs
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # clear gradients
        self.optimizer.zero_grad()

        # update weights local network
        loss.backward()

        # take one SGD step
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
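
A usage sketch for the PyTorch Agent above, assuming the module-level constants it references (device, LR, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, UPDATE_EVERY) plus QNetwork and ReplayBuffer are defined as in the original project; the loop is the usual epsilon-greedy Gym loop, not the original training script.

# Hypothetical training loop for the Agent class above.
import gym
import numpy as np

env = gym.make('LunarLander-v2')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              seed=0)

eps = 1.0
for episode in range(1000):
    state = env.reset()
    score = 0.0
    for t in range(1000):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    eps = max(0.01, eps * 0.995)  # decay exploration between episodes
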
Example #4
            for agent_index in range(3):
                summary_writer.add_summary(sess.run(reward_1000_op[agent_index], {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}), i // 1000)
        agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)
        

        a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]

        a.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])

        o_n_next, r_n, d_n, i_n = env.step(a)

        for agent_index in range(3):
            reward_100_list[agent_index].append(r_n[agent_index])
            reward_100_list[agent_index] = reward_100_list[agent_index][-1000:]

        agent1_memory.add(o_n[0], agent1_action[0], r_n[0], o_n_next[0], False)
        agent2_memory.add(o_n[1], agent2_action[0], r_n[1], o_n_next[1], False)
        agent3_memory.add(o_n[2], agent3_action[0], r_n[2], o_n_next[2], False)

        if i > 50000:
            #print('train')
            e *= 0.9999
            # agent1 train
            train_agent(agent1_ddpg, agent1_ddpg_target, agent1_memory, agent1_actor_target_update,
                        agent1_critic_target_update, sess)

            train_agent(agent2_ddpg, agent2_ddpg_target, agent2_memory, agent2_actor_target_update,
                        agent2_critic_target_update, sess)

            train_agent(agent3_ddpg, agent3_ddpg_target, agent3_memory, agent3_actor_target_update,
                        agent3_critic_target_update, sess)
Example #5
            for agent_index in range(3):
                summary_writer.add_summary(sess.run(reward_1000_op[agent_index], {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}), i // 1000)

        agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)

        a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]

        a.append([0, np.random.rand() * 2 - 1, 0, np.random.rand() * 2 - 1, 0])

        o_n_next, r_n, d_n, i_n = env.step(a)

        for agent_index in range(3):
            reward_100_list[agent_index].append(r_n[agent_index])
            reward_100_list[agent_index] = reward_100_list[agent_index][-1000:]

        agent1_memory.add(np.vstack([o_n[0], o_n[1], o_n[2]]), np.vstack([agent1_action[0], agent2_action[0], agent3_action[0]]),
                          r_n[0], np.vstack([o_n_next[0], o_n_next[1], o_n_next[2]]), False)

        agent2_memory.add(np.vstack([o_n[1], o_n[2], o_n[0]]), np.vstack([agent2_action[0], agent3_action[0], agent1_action[0]]),
                          r_n[1], np.vstack([o_n_next[1], o_n_next[2], o_n_next[0]]), False)

        agent3_memory.add(np.vstack([o_n[2], o_n[0], o_n[1]]), np.vstack([agent3_action[0], agent1_action[0], agent2_action[0]]),
                          r_n[2], np.vstack([o_n_next[2], o_n_next[0], o_n_next[1]]), False)

        if i > 50000:
            # e *= 0.9999
            # agent1 train
            train_agent(agent1_ddpg, agent1_ddpg_target, agent1_memory, agent1_actor_target_update,
                        agent1_critic_target_update, sess, [agent2_ddpg_target, agent3_ddpg_target])

            train_agent(agent2_ddpg, agent2_ddpg_target, agent2_memory, agent2_actor_target_update,
                        agent2_critic_target_update, sess, [agent3_ddpg_target, agent1_ddpg_target])
Example #6
class NeuralNetworkAgent(Agent):
    def __init__(self,
                 api,
                 network_class,
                 sess,
                 save_path,
                 history_size=15,
                 restore_path=None,
                 verbose=False,
                 train=False,
                 test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # currently 7500 w/ 1000

        # Network
        self.network = network_class(sess,
                                     save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test

        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        if piece_id != 19 and game_state == 1:
            # Train
            if self.train and not self.test and self.replay_buffer.size() > 250:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True

                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010

        if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
            # os.system('clear')
            print('--------------')
            is_random = False
            move = None
            if np.random.random() < self.epsilon or not self.training_begun:
                move = np.random.choice(self.possible_moves)
                is_random = True
            else:
                tensor = np.dstack([self.grid] + self.prev_states)
                pred = self.network.predict(tensor)[0]
                move = self.possible_moves[pred]

            if self.restart_game > 0:
                self.api.writeGamepad(0, 3, True)
                self.restart_game -= 1
                move = -2
            else:
                if move >= 0:
                    self.api.writeGamepad(0, move, True)
            self.placed_move = True
            self.show_board = True

            if self.last_move != -2 and piece_id != 19:
                print('Random:', is_random)
                S = self.grid.copy()
                self._update_board(self.api.peekCPU(0x0042))
                board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                n_empty = self._count_empty(self.grid)
                n_holes = self._count_holes(self.grid)
                height = self._count_height(board)
                levelness = self._determine_levelness(board)
                A = self.last_move
                # R  = self._count_total() + self._get_score() - n_empty
                #R = (-50 * height) + (-20 * n_holes) + (self._get_score())
                if height <= 2:
                    R = 1000
                else:
                    R = -200 * height
                R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                SP = self.grid.copy()

                self.prev_states.insert(0, S)

                print(np.dstack(self.prev_states).shape)

                self.replay_buffer.add(
                    np.dstack(self.prev_states), self.possible_moves.index(A),
                    R, np.dstack([SP] + self.prev_states[:self.history_size]))

                self.prev_states = self.prev_states[:self.history_size]

                print(self.epsilon)
                self._print_transition(S, A, board, R)

            self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """
        Renders the board and the current piece.
        TODO: do this lazily, so we aren't calling read too often O_o
        """

        # To make things easier, we're going to modify the next piece drop
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)

        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... Skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """
        Can be used to control the piece being dropped
        """
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
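
The reward shaping inside _controller_listener above can be restated as a standalone function; the sketch below only re-expresses that rule with made-up board metrics, using the same coefficients as the code.

# Restatement of Example #6's reward rule (illustrative, not project code).
def shaped_reward(height, n_holes, levelness):
    """Reward the agent for keeping the stack low, hole-free and level."""
    r = 1000 if height <= 2 else -200 * height
    return r + (-20 * n_holes) + (10 * levelness)

print(shaped_reward(2, 0, 1))   # flat 2-row stack, no holes -> 1010
print(shaped_reward(6, 3, 1))   # 6-high stack with 3 holes  -> -1250
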
Example #7
class DQN:
    def __init__(
        self,
        env,
        learning_rate=1e-3,
        seed=1234,
        gamma=0.99,
        max_eps=1.0,
        min_eps=0.1,
        render=False,
        print_freq=1,
        load_path=None,
        save_path=None,
        batch_size=32,
        log_dir='logs/train',
        max_steps=100000,
        buffer_capacity=None,
        max_episode_len=None,
        eps_decay_rate=-1e-4,
        target_update_freq=1000,
    ):
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path

        if load_path is not None:
            self.model.load_weights(load_path)

    def act(self, state):
        return np.argmax(self.model(state))

    @tf.function
    def train_step(self, states, indices, targets):
        """
        Performs a single step of gradient descent on the Q network

        Args:
            states: numpy array of states with shape (batch size, state dim)
            indices: list indices of the selected actions
            targets: targets for computing the MSE loss

        """
        with tf.GradientTape() as tape:
            action_values = tf.gather_nd(self.model(states), indices)
            mse_loss = tf.keras.losses.MeanSquaredError()(action_values,
                                                          targets)

        gradients = tape.gradient(mse_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))

        # Log training information
        with self.summary_writer.as_default():
            tf.summary.scalar('MSE Loss',
                              mse_loss,
                              step=self.optimizer.iterations)
            tf.summary.scalar('Estimated Q Value',
                              tf.reduce_mean(action_values),
                              step=self.optimizer.iterations)

    def update(self):
        """
        Computes the target for the MSE loss and calls the tf.function for gradient descent
        """
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)

            # Adjust the targets for non-terminal states
            next_state_pred = self.target(next_states)
            targets = rewards + self.gamma * next_state_pred.numpy().max(
                axis=1) * (1 - dones)
            batch_range = tf.range(start=0, limit=actions.shape[0])
            indices = tf.stack((batch_range, actions), axis=1)

            # update critic by minimizing the MSE loss
            self.train_step(states, indices, targets)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        mean_reward = None
        for t in range(self.max_steps):

            if t % self.target_update == 0:
                copy_weights(self.model.variables, self.target.variables)

            # exponential epsilon decay, from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(np.expand_dims(obs, axis=0))

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                with self.summary_writer.as_default():
                    ep += 1
                    self.rewards.append(total_reward)
                    total_reward = 0
                    obs = self.env.reset()

                    if ep % self.print_freq == 0 and ep > 0:
                        new_mean_reward = np.mean(
                            self.rewards[-self.print_freq:])

                        print(
                            f"-------------------------------------------------------"
                        )
                        print(
                            f"Mean {self.print_freq} Episode Reward: {new_mean_reward}"
                        )
                        print(f"Exploration fraction: {rand_actions / ep_len}")
                        print(f"Total Episodes: {ep}")
                        print(f"Total timesteps: {t}")
                        print(
                            f"-------------------------------------------------------"
                        )

                        tf.summary.scalar(
                            f'Mean {self.print_freq} Episode Reward',
                            new_mean_reward,
                            step=t)
                        tf.summary.scalar(f'Epsilon', eps, step=t)

                        # Model saving inspired by Open AI Baseline implementation
                        if (mean_reward is None or new_mean_reward >=
                                mean_reward) and self.save_path is not None:
                            print(
                                f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}"
                            )
                            print(f'Location: {self.save_path}')
                            mean_reward = new_mean_reward
                            self.model.save_weights(self.save_path)

                    ep_len = 0
                    rand_actions = 0
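
The (batch index, action) pairs built in DQN.update above are consumed by tf.gather_nd inside train_step to pick each row's chosen-action Q value; a minimal standalone illustration with made-up numbers:

# Standalone illustration of the gather_nd indexing used in Example #7.
import tensorflow as tf

q_values = tf.constant([[1.0, 2.0, 3.0],
                        [4.0, 5.0, 6.0]])            # shape (batch=2, actions=3)
actions = tf.constant([2, 0])                        # chosen action per row
batch_range = tf.range(start=0, limit=tf.shape(actions)[0])
indices = tf.stack((batch_range, actions), axis=1)   # [[0, 2], [1, 0]]
print(tf.gather_nd(q_values, indices).numpy())       # [3. 4.]
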
Example #8
def play(train_indicator):
    buffer_size = 100000
    batch_size = 32
    gamma = 0.99  # discount factor
    tau = 0.001  # Target Network HyperParameter
    lra = 0.0001  # Learning rate for Actor
    lrc = 0.001  # Learning rate for Critic
    ou_sigma = 0.3

    action_dim = 1  # Steering angle
    state_dim = 21  # num of sensors input

    episodes_num = 2000
    max_steps = 100000
    step = 0

    train_stat_file = "data/train_stat.txt"
    actor_weights_file = "data/actor.h5"
    critic_weights_file = "data/critic.h5"

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf_session = tf.Session(config=config)

    keras_backend.set_session(tf_session)

    actor = ActorNetwork(tf_session=tf_session,
                         state_size=state_dim,
                         action_size=action_dim,
                         hidden_units=(300, 600),
                         tau=tau,
                         lr=lra)
    critic = CriticNetwork(tf_session=tf_session,
                           state_size=state_dim,
                           action_size=action_dim,
                           hidden_units=(300, 600),
                           tau=tau,
                           lr=lrc)
    buffer = ReplayBuffer(buffer_size)

    # noise function for exploration
    ou = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                      sigma=ou_sigma * np.ones(action_dim))

    # Torcs environment - throttle and gear change controlled by client
    env = TorcsEnv(vision=False, throttle=False, gear_change=False)

    try:
        actor.model.load_weights(actor_weights_file)
        critic.model.load_weights(critic_weights_file)
        actor.target_model.load_weights(actor_weights_file)
        critic.target_model.load_weights(critic_weights_file)
        print("Weights loaded successfully")
    except Exception:
        print("Cannot load weights")

    for i in range(episodes_num):
        print("Episode : %s Replay buffer %s" % (i, len(buffer)))

        if i % 3 == 0:
            # relaunch TORCS every 3 episodes because of a memory leak
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # 21 len state dimensions - https://arxiv.org/abs/1304.1672
        state = np.hstack((ob.angle, ob.track, ob.trackPos))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0

            action_predicted = actor.model.predict(
                state.reshape(1,
                              state.shape[0])) + ou()  # predict and add noise

            observation, reward, done, info = env.step(action_predicted[0])

            state1 = np.hstack(
                (observation.angle, observation.track, observation.trackPos))

            buffer.add((state, action_predicted[0], reward, state1,
                        done))  # add replay buffer

            # batch update
            batch = buffer.get_batch(batch_size)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])  # placeholder with shape (batch, action_dim); overwritten below

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + gamma * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.get_gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.train_target_model()
                critic.train_target_model()

            total_reward += reward
            state = state1

            print("Episode %s - Step %s - Action %s - Reward %s" %
                  (i, step, action_predicted[0][0], reward))

            step += 1
            if done:
                break

        if i % 3 == 0 and train_indicator:
            print("Saving weights...")
            actor.model.save_weights(actor_weights_file, overwrite=True)
            critic.model.save_weights(critic_weights_file, overwrite=True)

        tm = time.strftime("%Y-%m-%d %H:%M:%S")
        episode_stat = "%s -th Episode. %s total steps. Total reward: %s. Time %s" % (
            i, step, total_reward, tm)
        print(episode_stat)
        with open(train_stat_file, "a") as outfile:
            outfile.write(episode_stat + "\n")

    env.end()
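
play() above depends on an OrnsteinUhlenbeckActionNoise helper that is not shown; the following is only a generic sketch of an OU process with the same constructor arguments (mu, sigma), not necessarily the project's implementation.

# Generic Ornstein-Uhlenbeck noise sketch: dx = theta * (mu - x) * dt + sigma * dW.
import numpy as np

class OUNoiseSketch:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.zeros_like(mu)

    def __call__(self):
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        self.x = self.x + dx
        return self.x

# e.g. ou = OUNoiseSketch(mu=np.zeros(1), sigma=0.3 * np.ones(1)); noise = ou()
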
Example #9
def train(conf,
          env,
          model,
          num_episodes=500,
          batch_size=100,
          buffer_size=10000):
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size

    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor
    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)
            if np.random.random() < eps:
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False

            debugger.print_step_before_move(step, observation, prediction,
                                            action, was_random)

            debugger.render_env_until_key_press(env)

            new_observation, reward, done, _ = env.step(action)

            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))

            # target_action_score = reward + (0 if done else discount_rate * np.max(model.predict(
            #     np.array([new_observation]))))

            # label = prediction
            # label[0][action] = target_action_score
            # model.fit(model_input, label, epochs=1,
            #           verbose=0)

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            labels = model.predict(obses_t)
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            # print('targets', targets)
            # print('rewards', rewards)
            for i in range(len(dones)):
                if dones[i]:
                    targets[i] = 0
                targets[i] += rewards[i]
                labels[i][actions[i]] = targets[i]
            model.fit(obses_t, labels, epochs=1, verbose=0)

            weights, batch_idxes = np.ones_like(rewards), None

            # debugger.print_step_after_move(reward, target_action_score,
            #                       label, model.predict(model_input))

            if reward > 0:
                total_food += 1
            step += 1

            observation = new_observation
        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()
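
The per-sample loop in Example #9 that zeroes the bootstrap term for terminal states and then adds the reward is the usual Q-learning target; a vectorized sketch of the same computation (illustrative, with toy numbers):

# Vectorized form of Example #9's target loop.
import numpy as np

def q_learning_targets(rewards, dones, next_q_values, discount_rate):
    """r + gamma * max_a' Q(s', a'), with the bootstrap zeroed for terminal states."""
    return rewards + discount_rate * np.max(next_q_values, axis=1) * (1.0 - dones)

print(q_learning_targets(np.array([1.0, 0.0]),
                         np.array([0.0, 1.0]),              # second transition is terminal
                         np.array([[0.5, 2.0], [3.0, 4.0]]),
                         discount_rate=0.95))
# -> [2.9 0. ]
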