def test_observation_zeroing(self):
        """ Tests zeroing out of frames not from current episode """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for terminal_idx in range(5):
            obs_ = []
            obs_next_ = []
            for i in range(1, 6):
                partial_obs = np.ones(obs_shape) * i
                terminal = 1 if i == terminal_idx else 0
                er.append(partial_obs, 0, 0, terminal)

                if i <= terminal_idx:
                    partial_obs *= 0
                if i < 5:
                    obs_.append(partial_obs)
                if i > 1:
                    obs_next_.append(partial_obs)
            obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
            obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

            batch = er.sample(1)
            obs, rewards, actions, obs_next, terminals = batch
            assert np.array_equal(obs_, obs)
            assert np.array_equal(obs_next_, obs_next)

    def test_sampling(self):
        """ Tests sampling of rewards, actions and terminals """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            er.append(partial_obs, 1, 1, 0)

        batch = er.sample(1)
        _, rewards, actions, _, terminals = batch
        assert np.array_equal(rewards, np.array([1]))
        assert np.array_equal(actions, np.array([1]))
        assert np.array_equal(terminals, np.array([0]))

    def test_observation_construction(self):
        """ Tests observation construction from partial observations """
        obs_shape = (84, 84, 1)
        er = ExperienceReplay(5, obs_shape)

        obs_ = []
        obs_next_ = []
        for i in range(1, 6):
            partial_obs = np.ones(obs_shape) * i
            if i < 5:
                obs_.append(partial_obs)
            if i > 1:
                obs_next_.append(partial_obs)
            er.append(partial_obs, 0, 0, 0)
        obs_ = np.transpose(np.array(obs_), (3, 1, 2, 0))
        obs_next_ = np.transpose(np.array(obs_next_), (3, 1, 2, 0))

        batch = er.sample(1)
        obs, rewards, actions, obs_next, terminals = batch
        assert np.array_equal(obs_, obs)
        assert np.array_equal(obs_next_, obs_next)
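

# A minimal sketch of the ExperienceReplay interface exercised by the tests
# above (frame-stacking buffer with a 4-frame history and zeroing of frames
# that belong to an earlier episode). This is an assumption about the
# implementation, not the original class; circular-buffer wrap-around
# handling is omitted for brevity.
import numpy as np


class ExperienceReplay:
    def __init__(self, capacity, obs_shape, history_length=4):
        self.capacity = capacity
        self.history_length = history_length
        self.obs = np.zeros((capacity,) + obs_shape, dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.terminals = np.zeros(capacity, dtype=np.int32)
        self.count = 0  # total number of frames appended so far

    def __len__(self):
        return min(self.count, self.capacity)

    def append(self, partial_obs, action, reward, terminal):
        idx = self.count % self.capacity
        self.obs[idx] = partial_obs
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.terminals[idx] = terminal
        self.count += 1

    def _stack(self, last_idx):
        # Stack history_length frames ending at last_idx and zero out every
        # frame at or before the most recent terminal inside the window.
        idxs = list(range(last_idx - self.history_length + 1, last_idx + 1))
        frames = self.obs[idxs].copy()  # (history, H, W, 1)
        term_positions = [j for j, i in enumerate(idxs) if self.terminals[i] == 1]
        if term_positions:
            frames[:term_positions[-1] + 1] = 0
        return np.transpose(frames, (3, 1, 2, 0))  # (1, H, W, history)

    def sample(self, batch_size):
        # Sample indices that have a full history of frames before them.
        valid = np.arange(self.history_length, len(self))
        idxs = np.random.choice(valid, size=batch_size)
        obs = np.concatenate([self._stack(i - 1) for i in idxs], axis=0)
        obs_next = np.concatenate([self._stack(i) for i in idxs], axis=0)
        return (obs, self.rewards[idxs], self.actions[idxs], obs_next,
                self.terminals[idxs])
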
class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000  # Maximum number of episodes allowed for training
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Mean reward per episode we want to reach before stopping training

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        updated_weights = np.array(self.main_qn.model.get_weights())
        self.target_qn.model.set_weights(updated_weights)

    def choose_action(self, state, prob_random, num_episode):
        if np.random.rand() < prob_random or \
                num_episode < self.min_pre_train_episodes:
            # Act randomly based on prob_random or if we
            # have not accumulated enough pre_train episodes
            action = np.random.randint(self.env.actions)
        else:
            # Decide what action to take from the Q network
            # First add a batch dimension to the state to match the network's expected input shape
            state = np.expand_dims(state, axis=0)
            action = np.argmax(self.main_qn.model.predict(state))
        return action

    def run_one_episode(self, num_episode, prob_random):
        # Create an experience replay for the current episode.
        experiences_episode = []

        # Get the game state from the environment
        state = self.env.reset()

        done = False  # Whether the game is complete
        cur_step = 0  # Number of steps taken so far in the episode

        while cur_step < self.max_num_step and not done:
            cur_step += 1
            action = self.choose_action(state=state,
                                        prob_random=prob_random,
                                        num_episode=num_episode)

            # Take the action and retrieve the next state, reward and done
            next_state, reward, done = self.env.step(action)

            # Setup the experience to be stored in the episode buffer
            experience = [state, action, reward, next_state, done]

            # Store the experience in the episode buffer
            experiences_episode.append(experience)

            # Update the state
            state = next_state

        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Q-value predictions from the main Q network; these will be
        # overwritten below for the actions actually taken
        target_q = self.main_qn.model.predict(train_state)

        # Mask that is 1 while the game is still running and 0 on game over;
        # we multiply the bootstrapped next-state value by it so terminal
        # transitions use only the immediate reward
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.target_qn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(
            self.batch_size)],
                                         axis=1)

        # Bellman target: immediate reward plus the discounted next-state value
        # (masked to zero for terminal transitions)
        actual_reward = train_reward + (self.y * train_next_state_values *
                                        train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        # Train batch is [[state,action,reward,next_state,done],...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):

        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will gradually reduce the probability of acting randomly and instead
        # take the actions that our Q network suggests
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start -
                            self.prob_random_end) / self.annealing_steps

        # Init variable
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracking training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode,
                                                       prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # If we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:

                # Decrease the probability of a random action until it reaches prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print Info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print(
                    "{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                    .format(dt_string, num_episode,
                            np.mean(rewards[-print_every:]), prob_random,
                            mean_loss))

            # Stop Condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print(
                    "{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                    .format(dt_string, num_episode,
                            np.mean(rewards[-print_every:]), prob_random,
                            mean_loss))
                print("Training complete because we reached goal rewards.")
                break
            if num_episode > self.max_num_episodes:
                print("Training Stop because we reached max num of episodes")
                break
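

# The ExperienceReplay used by the DQN above (and by the CartPole DQN below)
# only needs add() and sample() over whole [state, action, reward, next_state,
# done] experiences. A minimal sketch of that interface is given here as an
# assumption; the default buffer size and the deque-based eviction policy are
# not from the original code.
import random
from collections import deque


class ExperienceReplay:
    def __init__(self, buffer_size=20000):
        # Oldest experiences are discarded automatically once the buffer is full
        self.buffer = deque(maxlen=buffer_size)

    def add(self, experiences):
        # experiences: list of [state, action, reward, next_state, done] entries
        self.buffer.extend(experiences)

    def sample(self, size):
        # Uniformly sample `size` experiences without replacement
        return random.sample(list(self.buffer), size)
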
class DQN:
    """ Implementation of deep q learning algorithm """
    def __init__(self):

        self.prob_random = 1.0  # Probability of playing a random action
        self.y = .99  # Discount factor
        self.batch_size = 64  # How many experiences to use for each training step
        self.prob_random_end = .01  # Ending chance of random action
        self.prob_random_decay = .996  # Decay rate applied to prob_random
        self.max_episode = 300  # Maximum number of episodes allowed for training
        self.expected_goal = 200  # Average score over the last 10 episodes that ends training

        self.dnn = DNN()
        self.env = gym.make('CartPole-v0')

        self.memory = ExperienceReplay(buffer_size=10000)

        self.metadata = []  # Per-episode info: [timestamp, episode, score, prob_random]

    def choose_action(self, state, prob_random):
        if np.random.rand() <= prob_random:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.dnn.model.predict(state))
        return action

    def run_one_step(self, state):
        action = self.choose_action(state, self.prob_random)
        next_state, reward, done, _ = self.env.step(action)
        next_state = np.expand_dims(next_state, axis=0)
        return state, action, reward, next_state, done

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):

        # Q-value predictions from the main Q network; these will be
        # overwritten below for the actions actually taken
        target_q = self.dnn.model.predict(train_state)

        # Mask that is 1 while the game is still running and 0 on game over;
        # we multiply the bootstrapped next-state value by it so terminal
        # transitions use only the immediate reward
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.dnn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(
            self.batch_size)],
                                         axis=1)

        # Bellman target: immediate reward plus the discounted next-state value
        # (masked to zero for terminal transitions)
        actual_reward = train_reward + (self.y * train_next_state_values *
                                        train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):

        batch_data = self.memory.sample(self.batch_size)
        train_state = np.array([i[0] for i in batch_data])
        train_action = np.array([i[1] for i in batch_data])
        train_reward = np.array([i[2] for i in batch_data])
        train_next_state = np.array([i[3] for i in batch_data])
        train_done = np.array([i[4] for i in batch_data])

        # Remove the singleton batch dimension added when the states were stored
        train_state = np.squeeze(train_state)
        train_next_state = np.squeeze(train_next_state)

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        loss = self.dnn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        scores = []
        for e in range(self.max_episode):
            # Init New episode
            state = self.env.reset()
            state = np.expand_dims(state, axis=0)
            episode_score = 0
            while True:
                state, action, reward, next_state, done = self.run_one_step(
                    state)
                self.memory.add(
                    experiences=[[state, action, reward, next_state, done]])
                episode_score += reward
                state = next_state
                if len(self.memory.buffer) > self.batch_size:
                    self.train_one_step()
                    if self.prob_random > self.prob_random_end:
                        self.prob_random *= self.prob_random_decay
                if done:
                    now = datetime.now()
                    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                    self.metadata.append(
                        [now, e, episode_score, self.prob_random])
                    print(
                        "{} - episode: {}/{}, score: {:.1f} - prob_random {:.3f}"
                        .format(dt_string, e, self.max_episode, episode_score,
                                self.prob_random))
                    break
            scores.append(episode_score)

            # Average score over the last 10 episodes
            means_last_10_scores = np.mean(scores[-10:])
            if means_last_10_scores >= self.expected_goal:
                print('\n Task Completed! \n')
                break
            print("Average over last 10 episode: {0:.2f} \n".format(
                means_last_10_scores))
        print("Maximum number of episode played: %d" % self.max_episode)
Example #6
                logger.log("stats/max_tile", np.max(g.state), i)
                logger.log("stats/best_score", best_score, i)

                logger.log("settings/epsilon", epsilon, i)
                logger.log("settings/num_random_moves", num_random_moves, i)
                logger.log("settings/perc_random_moves",
                           num_random_moves / num_moves, i)
                logger.log("settings/experience", len(replay), i)

                reward = 0
                replay.add(
                    (state, action_onehot, reward, np.zeros(state.shape)))

            if i > OBSERVE:

                batch = replay.sample(batch_size=32)

                states = []
                actions = []
                rewards = []
                next_states = []
                for e, b in enumerate(batch):
                    states.append(b[0])
                    actions.append(b[1])
                    rewards.append(b[2])
                    next_states.append(b[3])
                states = np.array(states)
                actions = np.array(actions)
                rewards = np.array(rewards)
                next_states = np.array(next_states)
Example #7
    # Interleave planning and learning steps
    print("\nInterleaving planning and learning steps.", flush=True)
    actor.reset()
    steps_cnt = 0
    episode_steps = 0
    episodes_cnt = 0
    while episodes_cnt < n_episodes:
        r, episode_done = planning_step(actor=actor,
                                        planner=planner,
                                        dataset=experience_replay,
                                        policy_fn=network_policy,
                                        tree_budget=tree_budget,
                                        cache_subtree=cache_subtree,
                                        discount_factor=discount_factor)

        # Learning step
        batch = experience_replay.sample(batch_size)
        loss, _ = learner.train_step(tf.constant(batch["observations"], dtype=tf.float32),
                                     tf.constant(batch["target_policy"], dtype=tf.float32))

        steps_cnt += 1
        episode_steps += 1
        print("\n".join([" ".join(row) for row in env.unwrapped.get_char_matrix(actor.tree.root.data["s"])]),
              "Reward: ", r, "Simulator steps:", actor.nodes_generated,
              "Planning steps:", steps_cnt, "Loss:", loss.numpy(), "\n")
        if episode_done:
            print("Problem solved in %i steps (min 13 steps)."%episode_steps)
            actor.reset()
            episodes_cnt += 1
            episode_steps = 0
            if episodes_cnt < n_episodes: print("\n------- New episode -------")
Example #8
        # Get the new state after taking the action
        newstate, reward, done, _ = env.step(action)
        if newstate == []:
            print("Terminate")
            break
        replay_ep.add(
            np.reshape(np.array([state, action, reward, done, newstate]),
                       [1, 5]))
        # train
        if config.total_step > config.args.num_pretrain_step:
            if epsilon > config.args.end_epsilon:
                epsilon -= epsilon_decay

            if config.total_step % config.args.online_update_freq == 0:
                train_batch = replay.sample(config.args.batch_size)
                loss = qnet.learn_on_minibatch(train_batch, config.args.gamma)
                sys.stdout.write(
                    "\rTrain step at {}th step | loss {} | epsilon {}".format(
                        config.total_step, loss, epsilon))
                sys.stdout.flush()

            if config.total_step % config.args.target_update_freq == 0:

                # print("Update target net")
                qnet.update_target_model(config.args.tau)

        config.total_step += 1
        total_reward += reward
        state = newstate
        if done:
Example #9
class Agent:
    def __init__(self, s_size, a_size, seed):
        """

        Parameters:
            s_size (int): dimension of each state
            a_size (int): dimension of each action
            seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.seed = random.seed(seed)

        # Initialize both the Q-networks
        self.local_dqn = Model(s_size, a_size, seed).to(device)
        self.target_dqn = Model(s_size, a_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_dqn.parameters(),
                                    lr=c.LEARNING_RATE)

        # Initialize experience deque
        self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE,
                                       c.BATCH_SIZE, seed)

        # Time step counter used for updating as per UPDATE_FREQUENCY
        self.t_step = 0

    def step(self, s, a, r, s_next, done, transfer_method):
        # Add experience to the replay deque
        self.buffer.add(s, a, r, s_next, done)

        # Learn if UPDATE_FREQUENCY matched.
        self.t_step = (self.t_step + 1) % c.UPDATE_FREQUENCY
        if self.t_step == 0:
            # Get random experiences to learn from.
            if len(self.buffer) > c.BATCH_SIZE:
                es = self.buffer.sample()
                self.learn(es, transfer_method, c.GAMMA)

    def act(self, state, transfer_method, eps=0.):
        """Returns actions for given state as per current policy.

        Parameters:
            state (array_like): current state
            transfer_method (int): 0 if pre-trained weights are to be used, another int otherwise
            eps (float): epsilon, for exploration
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_dqn.eval()
        with torch.no_grad():
            a_values = self.local_dqn(state, transfer_method)
        self.local_dqn.train()

        # If a random number is larger than epsilon, pick the greedy action; otherwise pick a random one
        if random.random() > eps:
            return np.argmax(a_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.a_size))

    def learn(self, es, transfer_method, gamma):
        """Update parameters based on experiences.

        Parameters:
            es (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        s_, a_, r_, s_next_, d_ = es

        # Max predicted Q-values
        target_Q_next = self.target_dqn(
            s_next_, transfer_method).detach().max(1)[0].unsqueeze(1)

        # Target Q-value
        target_Q = r_ + (gamma * target_Q_next * (1 - d_))

        # Expected Q-values
        expected_Q = self.local_dqn(s_, transfer_method).gather(1, a_)

        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        update(self.local_dqn, self.target_dqn, c.TAU)
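

# The soft-update helper used in Agent.learn above is not defined in this
# snippet. A standard Polyak-averaging sketch matching its call signature
# update(local_model, target_model, tau) is shown below as an assumption.
def update(local_model, target_model, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)
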
Example #10
class DQN(object):
    """
    OOP for a Deep Q-Network (DQN). 
    """
    def __init__(self, game, memory_size = 100000, 
                 batch_size = 1, epsilon_init = 1.0, alpha_init = .00025,
                 anneal_alpha = True, anneal_epsilon = True, 
                 batch_size_incr = 0):
        self.memories = ExperienceReplay(memory_size)
        self.nnet = LeNet(game.state_shape, dim_out = game.n_actions, 
                          batch_size = 1, fc_dim = 500,
                          nkerns = [16,32], filter_dims = [2,2],
                          out_type = 'linear')
        self.trainer = single_batch_trainer(self.nnet)
        self.game = game
        self.n_episodes = 0
        self.avg_rewards = []
        self.avg_action_vals = []
        self.alpha = alpha_init
        self.epsilon = epsilon_init
        self.anneal_ep = anneal_epsilon
        self.anneal_lr = anneal_alpha
        self.batch_size = batch_size
        self.batch_size_incr = batch_size_incr
        self._pct_invalids = []
        self._costs = []
            
    def train(self, n_episodes = 3, max_iter = 500):
        g = self.game
        g.reset()
        # set anneal rate for epsilon.
        ep_anneal_rate = 0
        if self.anneal_ep:
            ep_anneal_rate = float(self.epsilon)/n_episodes
        alpha_anneal_rate = 0
        if self.anneal_lr:
            alpha_anneal_rate = float(self.alpha)/n_episodes
        for e_idx in range(n_episodes):
            s = g.get_state()
            print "Episode: %d, Exploration Rate: %f, Learning Rate: %f" %(e_idx, self.epsilon, self.alpha)
            while not g.is_terminal() and not self.game._num_moves >= max_iter and not self.game.iter_ctr >= 200:
                # epsilon-greedy action selection below
                if np.random.binomial(1,self.epsilon):
                    a_idx = np.random.randint(self.game.n_actions)
                else:
                    values = self.nnet.outputter(s.reshape(self.nnet.image_shape))
                    a_idx = np.argmax(values[0])
                r = g.take_action(a_idx)
                stp1 = g.get_state()
                # Reshape states into shape expected by convnet. 
                self.memories.insert(Memory(
                    s.transpose(2,0,1).reshape(self.nnet.image_shape), 
                    a_idx, 
                    r, 
                    stp1.transpose(2,0,1).reshape(self.nnet.image_shape)
                ))
                s = stp1

                # TEST CLOOJ
                if self.game.iter_ctr % 200 == 0:
                    print("move_n: %d, action: %d, reward: %f, status: %d" % (
                        self.game.iter_ctr, a_idx, r, self.game._STATUS
                    ))
                
                # Minibatch update. 
                if e_idx > 0:
                    costs = [] # local for this iter. 
                    data = self.memories.sample(self.batch_size) # random (state, action, reward, nxt_state) sample from memory replay. 
                    data = [m.target_pair(self.nnet) for m in data] # convert above tuple into training data, label pair. 
                    for i in range(self.batch_size):
                        d = data[i]
                        costs.append(self.trainer(d[0], d[1], self.alpha)) # call trainer func
                    self._costs.append(np.mean(costs))
#            print "Game %d ends in %d iterations with status %d, reward %d." %(e_idx, self.game.iter_ctr, self.game._STATUS, r)

            # compute percent invalid actions.
            n_moves = g.iter_ctr
            rs = g.episode_rewards
            n_invalid = len(np.where(rs == np.array([-.02 for _ in range(len(rs))]))[0])
            pct_invalid = float(n_invalid)/n_moves
            self._pct_invalids.append(pct_invalid)
            print "Pct Invalid: %f" %pct_invalid
            g.reset()
            self.epsilon -= ep_anneal_rate
            self.batch_size += self.batch_size_incr
            if e_idx > 0:
                self.alpha -= alpha_anneal_rate
Example #11
class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000  # Maximum number of episodes allowed for training
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Mean reward per episode we want to reach before stopping training

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        # TODO
        return

    def choose_action(self, state, prob_random, num_episode):
        # TODO
        return action

    def run_one_episode(self, num_episode, prob_random):
        # TODO
        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward, train_next_state, train_done):
        # TODO
        return target_q

    def train_one_step(self):
        # Train batch is [[state,action,reward,next_state,done],...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(
            train_state=train_state,
            train_action=train_action,
            train_reward=train_reward,
            train_next_state=train_next_state,
            train_done=train_done
        )

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):

        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will gradually reduce the probability of acting randomly and instead
        # take the actions that our Q network suggests
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variable
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracking training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # If we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:

                # Decrease the probability of a random action until it reaches prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency episodes, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print Info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))

            # Stop Condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))
                print("Training complete because we reached goal rewards.")
                break
            if num_episode > self.max_num_episodes:
                print("Training Stop because we reached max num of episodes")
                break
Example #12
def main(_):
    # Reproducibility
    tf.reset_default_graph()
    np.random.seed(cfg.random_seed)
    tf.set_random_seed(cfg.random_seed)

    # Logging
    summary_writer = tf.summary.FileWriter(cfg.log_dir)

    if not cfg.evaluate and not tf.gfile.Exists(cfg.save_dir):
        tf.gfile.MakeDirs(cfg.save_dir)
    else:
        assert tf.gfile.Exists(cfg.save_dir)

    # TODO handle this
    episode_results_path = os.path.join(cfg.log_dir, "episodeResults.csv")
    episode_results = tf.gfile.GFile(episode_results_path, "w")
    episode_results.write("model_freq={},save_dir={}".format(
        cfg.model_freq, cfg.save_dir))
    episode_results.write("episode,reward,steps\n")
    episode_results.flush()

    # Setup ALE and DQN graph
    obs_shape = (84, 84, 1)
    input_height, input_width, _ = obs_shape

    dqn = DQN(input_height, input_width, cfg.num_actions)

    # Global step
    global_step = tf.train.get_or_create_global_step()
    increment_step = tf.assign_add(global_step, 1)

    # Save all variables
    vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope="agent/q")
    vars_to_save.append(global_step)
    saver = tf.train.Saver(var_list=vars_to_save)

    # Handle loading specific variables
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)

    restore_or_initialize_weights(sess, dqn, saver)
    sess.run(dqn.copy_to_target)

    if cfg.evaluate:
        # if in evaluation mode, saver is no longer needed
        saver = None

    # ##### Restoring AEs ########
    if not cfg.evaluate:
        vaes = create_generative_models(sess)
        image_summaries = []
        image_summaries_ph = tf.placeholder(tf.float32,
                                            shape=(4, 84, 84, 4),
                                            name="image_summaries_placeholder")
        for i in range(4):
            for j in range(4):
                image_summaries.append(
                    tf.summary.image(
                        "VAE_OUT_{}_{}".format(i, j),
                        tf.reshape(image_summaries_ph[i, :, :, j],
                                   (1, 84, 84, 1))))
    # ############################

    if not cfg.evaluate:
        summary_writer.add_graph(tf.get_default_graph())
        summary_writer.add_graph(vaes[0].graph)
        summary_writer.add_graph(vaes[1].graph)
        summary_writer.add_graph(vaes[2].graph)

        summary_writer.flush()

    # Initialize ALE
    postprocess_frame = lambda frame: sess.run(dqn.process_frame,
                                               feed_dict={dqn.image: frame})
    env = AtariEnvironment(obs_shape, postprocess_frame)

    # Replay buffer
    if not cfg.evaluate:
        replay_buffer = ExperienceReplay(cfg.replay_buffer_size, obs_shape)

    # Perform random policy to get some training data
    with tqdm(total=cfg.seed_frames,
              disable=cfg.disable_progress or cfg.evaluate) as pbar:
        seed_steps = 0
        while seed_steps * cfg.frame_skip < cfg.seed_frames and not cfg.evaluate:
            action = np.random.randint(cfg.num_actions)
            reward, next_state, terminal = env.act(action)
            seed_steps += 1

            replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                 reward, terminal)

            if terminal:
                pbar.update(env.episode_frames)
                env.reset(inc_episode_count=False)

    if cfg.evaluate:
        assert cfg.max_episode_count > 0
    else:
        assert len(replay_buffer) >= cfg.seed_frames // cfg.frame_skip

    # Main training loop
    steps = tf.train.global_step(sess, global_step)
    env.reset(inc_episode_count=False)
    terminal = False

    total = cfg.max_episode_count if cfg.evaluate else cfg.num_frames
    with tqdm(total=total, disable=cfg.disable_progress) as pbar:
        # Loop while we haven't observed our max frame number
        # If we are at our max frame number we will finish the current episode
        while (not (
                # We must be evaluating or observed the last frame
                # As well as be terminal
                # As well as seen the maximum episode number
            (steps * cfg.frame_skip > cfg.num_frames or cfg.evaluate)
                and terminal and env.episode_count >= cfg.max_episode_count)):
            # Epsilon greedy policy with epsilon annealing
            if not cfg.evaluate and steps * cfg.frame_skip < cfg.eps_anneal_over:
                # Only compute epsilon step while we're still annealing epsilon
                epsilon = cfg.eps_initial - steps * (
                    (cfg.eps_initial - cfg.eps_final) / cfg.eps_anneal_over)
            else:
                epsilon = cfg.eps_final

            # Epsilon greedy policy
            if np.random.uniform() < epsilon:
                action = np.random.randint(0, cfg.num_actions)
            else:
                action = sess.run(dqn.action, feed_dict={dqn.S: [env.state]})

            # Perform environment step
            steps = sess.run(increment_step)
            reward, next_state, terminal = env.act(action)

            if not cfg.evaluate:
                replay_buffer.append(next_state[:, :, -1, np.newaxis], action,
                                     reward, terminal)

                # Sample and do gradient updates
                if steps % cfg.learning_freq == 0:
                    placeholders = [
                        dqn.S,
                        dqn.actions,
                        dqn.rewards,
                        dqn.S_p,
                        dqn.terminals,
                    ]
                    batch = replay_buffer.sample(cfg.batch_size)
                    train_op = [dqn.train]
                    if steps % (cfg.learning_freq * cfg.model_freq) == 0:
                        experience_batch = batch
                        batch = imagined_batch(vaes, batch[1])
                        if steps / (cfg.learning_freq * cfg.model_freq) < 10:
                            placeholders.append(image_summaries_ph)
                            batch = list(batch)
                            batch.append(batch[0][
                                np.random.randint(0, 32, size=4), :, :, :])
                            train_op.extend(image_summaries)
                    if steps % cfg.log_summary_every == 0:
                        train_op.append(dqn.summary)
                    result = sess.run(
                        train_op,
                        feed_dict=dict(zip(placeholders, batch)),
                    )
                    if len(result) > 1:
                        for i in range(1, len(result)):
                            summary_writer.add_summary(result[i],
                                                       global_step=steps)
                if steps % cfg.target_update_every == 0:
                    sess.run([dqn.copy_to_target])
                if steps % cfg.model_chkpt_every == 0:
                    saver.save(sess,
                               "%s/model_epoch_%04d" % (cfg.save_dir, steps))

            if terminal:
                episode_results.write("%d,%d,%d\n" %
                                      (env.episode_count, env.episode_reward,
                                       env.episode_frames))
                episode_results.flush()
                # Log episode summaries to Tensorboard
                add_simple_summary(summary_writer, "episode/reward",
                                   env.episode_reward, env.episode_count)
                add_simple_summary(summary_writer, "episode/frames",
                                   env.episode_frames, env.episode_count)

                pbar.update(env.episode_frames if not cfg.evaluate else 1)
                env.reset()

    episode_results.close()
    tf.logging.info("Finished %d %s" % (
        cfg.max_episode_count if cfg.evaluate else cfg.num_frames,
        "episodes" if cfg.evaluate else "frames",
    ))
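

# add_simple_summary() is referenced above but not defined in this snippet; a
# typical TF1-style helper that writes a single scalar to TensorBoard might
# look like the following (an assumption, not the original code).
def add_simple_summary(writer, tag, value, step):
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    writer.add_summary(summary, global_step=step)
    writer.flush()
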
Example #13
class DQN_agent():
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(
                max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None, ))
        self.actions = tf.placeholder(tf.int32, shape=(None, ))
        self.is_weights = tf.placeholder(tf.float32, shape=(
            None, ))  # importance sampling weights for prioritized replay
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph(
        )  # build main network
        self.target_Q_out_op, _, _ = self.build_graph(
            'target')  # build identical target network

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()

    def build_graph(self, scope='main'):
        with tf.variable_scope(scope):
            h = tf.layers.dense(self.inputs,
                                16,
                                activation=tf.nn.relu,
                                name="h")
            outputs = tf.layers.dense(h,
                                      self.env.num_actions,
                                      activation=tf.nn.softmax,
                                      name="outputs")

            # everything is now the same shape (batch_size, num_actions)
            # nonzero error only for selected actions
            action_mask = tf.one_hot(self.actions,
                                     self.env.num_actions,
                                     on_value=True,
                                     off_value=False)
            targets = tf.tile(tf.expand_dims(self.target_values, 1),
                              [1, self.env.num_actions])
            target_outputs = tf.where(
                action_mask, targets, outputs
            )  # takes target value where mask is true. takes outputs value otherwise

            td_error = target_outputs - outputs  # only one element in each row is non-zero
            weights = tf.tile(tf.expand_dims(self.is_weights, 1),
                              [1, self.env.num_actions
                               ])  # all 1s when not using priority replay
            weighted_td_error = weights * td_error  # element-wise multiplication

            loss = tf.reduce_sum(tf.square(weighted_td_error))
            update = tf.train.AdamOptimizer().minimize(loss)
        return outputs, update, td_error

    def train(self):
        steps_per_ep = np.zeros(episodes)
        for episode in range(episodes):
            print(episode)
            self.env.reset()
            state = self.env.state
            done = False
            num_steps = 0
            while not done:
                num_steps += 1
                action = self.get_eps_action(state, self.eps)
                next_state, reward, done, _ = self.env.step(action)
                self.replay.add((state, action, reward, next_state,
                                 done))  # store in experience replay

                # sample from experience replay
                if prioritized_replay:
                    beta = beta0 + episode * (
                        1 - beta0
                    ) / episodes  # linear annealing schedule for IS weights
                    states, actions, rewards, next_states, dones, weights, indices = self.replay.sample(
                        self.batch_size, beta)
                    self.net_update(states, actions, rewards, next_states,
                                    dones, weights, indices)  # qlearning
                else:
                    states, actions, rewards, next_states, dones = self.replay.sample(
                        self.batch_size)
                    self.net_update(states, actions, rewards, next_states,
                                    dones)  # qlearning

                # slowly update target network
                if num_steps % update_every == 0:
                    self.target_net_update()

                # sort max heap periodically
                if num_steps % sort_every == 0:
                    if prioritized_replay and replay_type == "ranked":
                        self.replay.sort()

                state = next_state
            steps_per_ep[episode] = num_steps
        return steps_per_ep

    # from https://tomaxent.com/2017/07/09/Using-Tensorflow-and-Deep-Q-Network-Double-DQN-to-Play-Breakout/
    def target_net_update(self):
        # get sorted lists of parameters in each of the networks
        main_params = [
            t for t in tf.trainable_variables() if t.name.startswith("main")
        ]
        main_params = sorted(main_params, key=lambda v: v.name)
        target_params = [
            t for t in tf.trainable_variables() if t.name.startswith("target")
        ]
        target_params = sorted(target_params, key=lambda v: v.name)

        update_ops = []
        for main_v, target_v in zip(main_params, target_params):
            op = target_v.assign(main_v)
            update_ops.append(op)

        self.sess.run(update_ops)

    # minibatch qlearning
    def net_update(self,
                   states,
                   actions,
                   rewards,
                   next_states,
                   dones,
                   weights=None,
                   indices=None):
        not_dones = np.logical_not(dones)

        # create a shape (batch_size, ) array of target values
        target_values = rewards.astype(
            float)  # np.array of shape (batch_size, )
        next_inputs = next_states[
            not_dones]  # np.array of shape (#not done, state_size)
        next_Qs = self.sess.run(self.Q_out_op,
                                {self.inputs: next_inputs
                                 })  # np.array of shape (#not done, num_actions)
        max_Qs = np.max(next_Qs, axis=1)  # np.array of shape (#not done,)
        target_values[not_dones] += gamma * max_Qs

        # if not using prioritized replay
        if weights is None:
            weights = np.ones(self.batch_size)

        # compute gradients and update parameters
        _, td_error = self.sess.run([self.Q_update_op, self.td_error_op], \
                {self.inputs: states, self.target_values: target_values, self.actions: actions, self.is_weights: weights})

        # update priority replay priorities
        if indices is not None:
            td_error = td_error.ravel()[np.flatnonzero(
                td_error)]  # shape (batch_size, )
            self.replay.update_priorities(
                indices,
                np.abs(td_error) + 1e-3
            )  # add small number to prevent never sampling 0 error transitions

    # returns eps-greedy action with respect to Q
    def get_eps_action(self, state, eps):
        if self.env.np_random.uniform() < eps:
            action = self.env.sample()
        else:
            Q = self.sess.run(self.Q_out_op, {self.inputs: np.array([state])})
            max_actions = np.where(np.ravel(Q) == Q.max())[0]
            action = self.env.np_random.choice(
                max_actions)  # to select argmax randomly
        return action
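

# A minimal usage sketch for the agent above, assuming the module-level
# settings it references (prioritized_replay, replay_type, max_buffer_size,
# episodes, gamma, update_every, sort_every, beta0, ...) are defined elsewhere
# in the original module:
if __name__ == "__main__":
    agent = DQN_agent()
    agent.sess.run(agent.init_op)
    steps_per_episode = agent.train()
    print("Mean steps per episode:", steps_per_episode.mean())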