class DQN:
    """ Implementation of deep q learning algorithm """
    def __init__(self):

        self.prob_random = 1.0  # Probability to play random action
        self.y = .99  # Discount factor
        self.batch_size = 64  # How many experiences to use for each training step
        self.prob_random_end = .01  # Ending chance of random action
        self.prob_random_decay = .996  # Decrease decay of the prob random
        self.max_episode = 300  # Max number of episodes allowed to train the agent
        self.expected_goal = 200  # Target score that counts the task as solved

        self.dnn = DNN()
        self.env = gym.make('CartPole-v0')

        self.memory = ExperienceReplay(buffer_size=10000)

        self.metadata = []  # Per-episode info (timestamp, episode, score, prob_random)

    def choose_action(self, state, prob_random):
        if np.random.rand() <= prob_random:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(self.dnn.model.predict(state))
        return action

    def run_one_step(self, state):
        action = self.choose_action(state, self.prob_random)
        next_state, reward, done, _ = self.env.step(action)
        next_state = np.expand_dims(next_state, axis=0)
        return state, action, reward, next_state, done

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):

        # Our predictions (actions to take) from the main Q network
        target_q = self.dnn.model.predict(train_state)

        # Tells us whether game over or not
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.dnn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(
            self.batch_size)],
                                         axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values *
                                        train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):

        batch_data = self.memory.sample(self.batch_size)
        train_state = np.array([i[0] for i in batch_data])
        train_action = np.array([i[1] for i in batch_data])
        train_reward = np.array([i[2] for i in batch_data])
        train_next_state = np.array([i[3] for i in batch_data])
        train_done = np.array([i[4] for i in batch_data])

        # Remove the singleton batch dimension added when the states were stored
        train_state = np.squeeze(train_state)
        train_next_state = np.squeeze(train_next_state)

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        loss = self.dnn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):
        scores = []
        for e in range(self.max_episode):
            # Init New episode
            state = self.env.reset()
            state = np.expand_dims(state, axis=0)
            episode_score = 0
            while True:
                state, action, reward, next_state, done = self.run_one_step(
                    state)
                self.memory.add(
                    experiences=[[state, action, reward, next_state, done]])
                episode_score += reward
                state = next_state
                if len(self.memory.buffer) > self.batch_size:
                    self.train_one_step()
                    if self.prob_random > self.prob_random_end:
                        self.prob_random *= self.prob_random_decay
                if done:
                    now = datetime.now()
                    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                    self.metadata.append(
                        [now, e, episode_score, self.prob_random])
                    print(
                        "{} - episode: {}/{}, score: {:.1f} - prob_random {:.3f}"
                        .format(dt_string, e, self.max_episode, episode_score,
                                self.prob_random))
                    break
            scores.append(episode_score)

            # Average score over the last 10 episodes
            means_last_10_scores = np.mean(scores[-10:])
            if means_last_10_scores >= self.expected_goal:
                print('\n Task Completed! \n')
                break
            print("Average over last 10 episode: {0:.2f} \n".format(
                means_last_10_scores))
        print("Maximum number of episode played: %d" % self.max_episode)
Example #2
class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000  # Max number of episodes allowed to train the agent
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Mean reward per episode we want to reach before stopping training

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        updated_weights = np.array(self.main_qn.model.get_weights())
        self.target_qn.model.set_weights(updated_weights)

    def choose_action(self, state, prob_random, num_episode):
        if np.random.rand() < prob_random or \
                num_episode < self.min_pre_train_episodes:
            # Act randomly based on prob_random or if we
            # have not accumulated enough pre_train episodes
            action = np.random.randint(self.env.actions)
        else:
            # Decide what action to take from the Q network
            # First add a batch dimension to the state to match the network's expected input shape
            state = np.expand_dims(state, axis=0)
            action = np.argmax(self.main_qn.model.predict(state))
        return action

    def run_one_episode(self, num_episode, prob_random):
        # Create an experience replay for the current episode.
        experiences_episode = []

        # Get the game state from the environment
        state = self.env.reset()

        done = False  # Whether the episode has ended
        cur_step = 0  # Running sum of number of steps taken in episode

        while cur_step < self.max_num_step and not done:
            cur_step += 1
            action = self.choose_action(state=state,
                                        prob_random=prob_random,
                                        num_episode=num_episode)

            # Take the action and retrieve the next state, reward and done
            next_state, reward, done = self.env.step(action)

            # Setup the experience to be stored in the episode buffer
            experience = [state, action, reward, next_state, done]

            # Store the experience in the episode buffer
            experiences_episode.append(experience)

            # Update the state
            state = next_state

        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward,
                          train_next_state, train_done):
        # Our predictions (actions to take) from the main Q network
        target_q = self.main_qn.model.predict(train_state)

        # Tells us whether game over or not
        # We will multiply our rewards by this value
        # to ensure we don't train on the last move
        train_gameover = train_done == 0

        # Q value of the next state based on action
        target_q_next_state = self.target_qn.model.predict(train_next_state)
        train_next_state_values = np.max(target_q_next_state[range(
            self.batch_size)],
                                         axis=1)

        # Reward from the action chosen in the train batch
        actual_reward = train_reward + (self.y * train_next_state_values *
                                        train_gameover)
        target_q[range(self.batch_size), train_action] = actual_reward
        return target_q

    def train_one_step(self):
        # Train batch is [[state,action,reward,next_state,done],...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(train_state=train_state,
                                          train_action=train_action,
                                          train_reward=train_reward,
                                          train_next_state=train_next_state,
                                          train_done=train_done)

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):

        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will begin reducing the probability of acting randomly, and instead
        # take the actions that our Q network suggests
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start -
                            self.prob_random_end) / self.annealing_steps

        # Init variable
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracking training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode,
                                                       prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # If we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:

                # Decrease the probability of a random action until we reach prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency iteration, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print Info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print(
                    "{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                    .format(dt_string, num_episode,
                            np.mean(rewards[-print_every:]), prob_random,
                            mean_loss))

            # Stop Condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print(
                    "{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}"
                    .format(dt_string, num_episode,
                            np.mean(rewards[-print_every:]), prob_random,
                            mean_loss))
                print("Training complete because we reached goal rewards.")
                break
            if num_episode > self.max_num_episodes:
                print("Training Stop because we reached max num of episodes")
                break
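As in the previous example, Qnetwork and ExperienceReplay are assumed helpers that are not shown here. The gridworld gameEnv typically returns an image observation, so a plausible Qnetwork is a small convolutional Keras model; the input shape, layer sizes and optimizer below are illustrative assumptions only. With such a network in place, training is started with DQN().train().

from tensorflow import keras


class Qnetwork:
    """Minimal convolutional Q-network sketch for an image-based gridworld (shapes are assumptions)."""
    def __init__(self, obs_shape=(84, 84, 3), n_actions=4, lr=1e-4):
        self.model = keras.Sequential([
            keras.layers.Conv2D(32, 8, strides=4, activation="relu",
                                input_shape=obs_shape),
            keras.layers.Conv2D(64, 4, strides=2, activation="relu"),
            keras.layers.Conv2D(64, 3, strides=1, activation="relu"),
            keras.layers.Flatten(),
            keras.layers.Dense(512, activation="relu"),
            keras.layers.Dense(n_actions, activation="linear"),  # one Q-value per action
        ])
        self.model.compile(loss="mse",
                           optimizer=keras.optimizers.Adam(learning_rate=lr))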
Example #3
        # get action
        if (config.total_step < config.args.num_pretrain_step
                or np.random.rand(1) < epsilon):
            action = np.random.randint(env.num_action)
            num_random_step += 1

        else:
            action = qnet.get_actions(state)[0]

        # Take the action and observe the next state, reward and done flag
        newstate, reward, done, _ = env.step(action)
        if (newstate == []):
            print("Terminate")
            break
        replay_ep.add(
            np.reshape(np.array([state, action, reward, done, newstate]),
                       [1, 5]))
        # train
        if config.total_step > config.args.num_pretrain_step:
            if epsilon > config.args.end_epsilon:
                epsilon -= epsilon_decay

            if config.total_step % config.args.online_update_freq == 0:
                train_batch = replay.sample(config.args.batch_size)
                loss = qnet.learn_on_minibatch(train_batch, config.args.gamma)
                sys.stdout.write(
                    "\rTrain step at {}th step | loss {} | epsilon {}".format(
                        config.total_step, loss, epsilon))
                sys.stdout.flush()

            if config.total_step % config.args.target_update_freq == 0:
Example #4
                next_state = preprocess(g.state)
                next_state = np.expand_dims(next_state, axis=2)  # channel axis

                action_onehot = np.zeros(NUM_ACTIONS)
                action_onehot[action] = 1

                state = np.expand_dims(state, axis=2)

                if reward > 0:
                    extra_bonus = 0
                    if np.max(state) == state[0, 0]:
                        extra_bonus += math.log2(2**20)
                        if np.argmax(np.sum(state, axis=1)):
                            extra_bonus += math.log2(2**20)
                    reward = math.log2(reward) + num_merges + extra_bonus
                replay.add((state, action_onehot, reward, next_state))

            if g.is_game_over():
                logger.log("stats/score", g.score, i)
                logger.log("stats/num_moves", num_moves, i)
                logger.log("stats/max_tile", np.max(g.state), i)
                logger.log("stats/best_score", best_score, i)

                logger.log("settings/epsilon", epsilon, i)
                logger.log("settings/num_random_moves", num_random_moves, i)
                logger.log("settings/perc_random_moves",
                           num_random_moves / num_moves, i)
                logger.log("settings/experience", len(replay), i)

                reward = 0
                replay.add(
Example #5
class Agent:
    def __init__(self, s_size, a_size, seed):
        """

        Parameters:
            s_size (int): dimension of each state
            a_size (int): dimension of each action
            seed (int): random seed
        """
        self.s_size = s_size
        self.a_size = a_size
        self.seed = random.seed(seed)

        # Initialize both the Q-networks
        self.local_dqn = Model(s_size, a_size, seed).to(device)
        self.target_dqn = Model(s_size, a_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_dqn.parameters(),
                                    lr=c.LEARNING_RATE)

        # Initialize experience deque
        self.buffer = ExperienceReplay(a_size, c.REPLAY_BUFFER_SIZE,
                                       c.BATCH_SIZE, seed)

        # Time step counter used for updating as per UPDATE_FREQUENCY
        self.t_step = 0

    def step(self, s, a, r, s_next, done, transfer_method):
        # Add experience to the replay buffer
        self.buffer.add(s, a, r, s_next, done)

        # Learn if UPDATE_FREQUENCY matched.
        self.t_step = (self.t_step + 1) % c.UPDATE_FREQUENCY
        if self.t_step == 0:
            # Get random experiences to learn from.
            if len(self.buffer) > c.BATCH_SIZE:
                es = self.buffer.sample()
                self.learn(es, transfer_method, c.GAMMA)

    def act(self, state, transfer_method, eps=0.):
        """Returns actions for given state as per current policy.

        Parameters:
            state (array_like): current state
            transfer_method (int): 0 if pre-trained weights are to be used, another int otherwise
            eps (float): epsilon, for exploration
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_dqn.eval()
        with torch.no_grad():
            a_values = self.local_dqn(state, transfer_method)
        self.local_dqn.train()

        # Generate a random number; if it is larger than epsilon pick the greedy action, otherwise a random one
        if random.random() > eps:
            return np.argmax(a_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.a_size))

    def learn(self, es, transfer_method, gamma):
        """Update parameters based on experiences.

        Parameters:
            es (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        s_, a_, r_, s_next_, d_ = es

        # Max predicted Q-values
        target_Q_next = self.target_dqn(
            s_next_, transfer_method).detach().max(1)[0].unsqueeze(1)

        # Target Q-value
        target_Q = r_ + (gamma * target_Q_next * (1 - d_))

        # Expected Q-values
        expected_Q = self.local_dqn(s_, transfer_method).gather(1, a_)

        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        update(self.local_dqn, self.target_dqn, c.TAU)
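The update helper called on the last line is not defined in this snippet. A common convention for a target-network update with a TAU parameter is a Polyak (soft) update that blends a fraction TAU of the local weights into the target weights; the sketch below shows that convention and is an assumption, not necessarily the original implementation.

def update(local_model, target_model, tau):
    """Soft-update target parameters: theta_target <- tau*theta_local + (1 - tau)*theta_target."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)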
Example #6
class DQNAgent:
    def __init__(self,
                 env,
                 net_update_rate: int = 25,
                 exploration_rate: float = 1.0,
                 exploration_decay: float = 0.00005):
        # set hyper parameters
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.net_updating_rate = net_update_rate

        # set environment
        self.env = env
        self.state_shape = env.get_state_shape()
        self.action_shape = env.get_action_shape()

        # the number of experience per batch for batch learning
        # Experience Replay for batch learning
        self.exp_rep = ExperienceReplay()

        # Deep Q Network
        self.net = None

    def set_model(self, model):
        """ Sets the model the agent is used to train. Receives a compiled tf Model with
            input_shape = env.observation_space and output_shape = env.action_s pace"""
        self.net = DoubleDQN(model)

    def get_action(self, state: np.ndarray, eps=0) -> int:
        """Given a state returns a random action with probability eps, and argmax(q_net(state)) with probability 1-eps.
           (only legal actions are considered)"""
        if self.net is None:
            raise NotImplementedError(
                'agent.get_action was called before a model was set.\n Please set the agent\'s model'
                ' using the set_model method. You can access the state and action shapes using '
                'agent\'s methods \'get_state_shape\' and \'get_action_shape\''
            )
        legal_actions = self.env.get_legal_actions(state)

        if np.random.random() >= eps:  # Exploitation

            # Calculate the Q-value of each action
            q_values = self.net.predict(state[np.newaxis, ...],
                                        np.expand_dims(legal_actions, 0))

            # Make sure we only choose between available actions
            legal_actions = np.logical_and(legal_actions,
                                           q_values == np.max(q_values))

        return np.random.choice(np.flatnonzero(legal_actions))

    def update_net(self, batch_size: int):
        """ if there are more than batch_size experiences, Optimizes the network's weights using the Double-Q-learning
         algorithm with a batch of experiences, else returns"""
        if self.exp_rep.get_num() < batch_size:
            return
        batch = self.exp_rep.get_batch(batch_size)
        self.net.fit(*batch)

    def train(self,
              episodes: int,
              path: str,
              checkpoint_rate=100,
              batch_size: int = 64,
              exp_decay_func=lambda exp_rate, exp_decay, i: 0.01 +
              (exp_rate - 0.01) * np.exp(-exp_decay * (i + 1)),
              show_progress=False):
        """
        Runs a training session for the agent
        :param episodes: number of episodes to train.
        :param path: a path to a directory where the trained weights will be saved.
        :param batch_size: number of experiences to learn from in each net_update.
        """
        if self.net is None:
            raise NotImplementedError(
                'agent.train was called before a model was set.\n Please set the agent\'s model'
                ' using the set_model method. You can access the state and action shapes using '
                'agent\'s methods \'get_state_shape\' and \'get_action_shape\''
            )
        # set hyper parameters
        exploration_rate = self.exploration_rate
        total_rewards = []
        # start training
        for episode in tqdm(range(episodes)):
            state = self.env.reset()  # Reset the environment for a new episode
            step, episode_reward = 0, 0
            run = True
            # Run until the episode ends
            while run:

                step += 1
                # choose a current action using epsilon greedy exploration
                action = self.get_action(state, exploration_rate)

                # apply the chosen action to the environment and observe the next_state and reward
                obs = self.env.step(action)
                next_state, reward, is_terminal = obs[:3]
                episode_reward += reward

                # Add experience to memory
                self.exp_rep.add(state, action, reward, next_state,
                                 self.env.get_legal_actions(state),
                                 is_terminal)

                # Optimize the DoubleQ-net
                self.update_net(batch_size)

                if is_terminal:  # The action taken led to a terminal state
                    run = False

                if (step % self.net_updating_rate) == 0 and step > 0:
                    # update target network
                    self.net.align_target_model()
                state = next_state

            # Update total_rewards to keep track of progress
            total_rewards.append(episode_reward)
            # Update target network at the end of the episode
            self.net.align_target_model()
            # Update exploration rate
            exploration_rate = exp_decay_func(exploration_rate,
                                              self.exploration_decay, episode)

            if episode % checkpoint_rate == 0 and self.exp_rep.get_num(
            ) > batch_size:
                self.save_weights(
                    os.path.join(path, f'episode_{episode}_weights'))

                if show_progress:  # Plot a moving average of last 10 episodes
                    self.plot_progress(total_rewards)

        # Update the agent's exploration rate in case more training is needed.
        self.exploration_rate = exploration_rate

        # Save total_rewards as a CSV file to the specified path.
        with open(os.path.join(path, 'rewards.csv'), 'w') as reward_file:
            rewards = pd.DataFrame(total_rewards)
            rewards.to_csv(reward_file)
        self.save_weights(os.path.join(path, 'final_weights'))

    def plot_progress(self, total_rewards):
        w = np.ones(10) / 10
        moving_average = np.convolve(total_rewards, w, mode='valid')
        plt.plot(np.arange(len(moving_average)), moving_average)
        plt.title('Moving average of rewards across episodes')
        plt.xlabel('episodes')
        plt.ylabel('average reward over last 10 episodes')
        plt.show()

    def get_state_shape(self):
        return self.state_shape

    def get_action_shape(self):
        return self.action_shape

    # Handles saving/loading the model as explained here: https://www.tensorflow.org/guide/keras/save_and_serialize
    def load_weights(self, path):
        self.net.load_weights(path)

    def save_weights(self, path):
        self.net.save_weights(path)

    def save_model(self, path):
        if self.net is None:
            raise NotImplementedError(
                'agent.save_model was called before a model was set.\n Please set the '
                'agent\'s model using the set_model method. You can access the state and action '
                'shapes using agent\'s methods \'get_state_shape\' and \'get_action_shape\''
            )
        self.net.save_model(path)

    def load_model(self, path):
        model = load_model(path)
        self.set_model(model)

    def to_json(self, **kwargs):
        if self.net is None:
            raise NotImplementedError(
                'agent.to_json was called before a model was set.\n Please set the '
                'agent\'s model using the set_model method. You can access the state and action '
                'shapes using agent\'s methods \'get_state_shape\' and \'get_action_shape\''
            )
        return self.net.to_json(**kwargs)

    def from_json(self, json_config):
        model = model_from_json(json_config)
        self.set_model(model)
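The DoubleDQN wrapper used by this agent is not shown. The core idea of Double Q-learning is to select the next action with the online network but evaluate it with the target network. The sketch below illustrates that target computation for Keras-style models; the function name, arguments and shapes are assumptions for illustration, not the repository's API.

import numpy as np


def double_dqn_targets(online_model, target_model, states, actions, rewards,
                       next_states, dones, gamma=0.99):
    """Build training targets: the online net picks the next action, the target net evaluates it."""
    q_values = online_model.predict(states)                      # shape (batch, n_actions)
    next_actions = np.argmax(online_model.predict(next_states), axis=1)
    next_q_target = target_model.predict(next_states)            # shape (batch, n_actions)
    batch_idx = np.arange(len(states))
    selected = next_q_target[batch_idx, next_actions]
    q_values[batch_idx, actions] = rewards + gamma * selected * (1 - dones)
    return q_values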
Example #7
class DQN:
    def __init__(self):
        self.batch_size = 64  # How many experiences to use for each training step
        self.train_frequency = 5  # How often you update the network
        self.num_epochs = 20  # How many epochs to train when updating the network
        self.y = 0.99  # Discount factor
        self.prob_random_start = 0.6  # Starting chance of random action
        self.prob_random_end = 0.1  # Ending chance of random action
        self.annealing_steps = 1000.  # Steps of training to reduce from start_e -> end_e
        self.max_num_episodes = 10000  # Max number of episodes allowed to train the agent
        self.min_pre_train_episodes = 100  # Number of episodes played with random actions before training starts
        self.max_num_step = 50  # Maximum allowed episode length
        self.goal = 15  # Mean reward per episode we want to reach before stopping training

        # Set env
        self.env = gameEnv(partial=False, size=5)

        # Reset everything from keras session
        K.clear_session()

        # Setup our Q-networks
        self.main_qn = Qnetwork()
        self.target_qn = Qnetwork()

        # Setup our experience replay
        self.experience_replay = ExperienceReplay()

    def update_target_graph(self):
        # TODO
        return

    def choose_action(self, state, prob_random, num_episode):
        # TODO
        return action

    def run_one_episode(self, num_episode, prob_random):
        # TODO
        return experiences_episode

    def generate_target_q(self, train_state, train_action, train_reward, train_next_state, train_done):
        # TODO
        return target_q

    def train_one_step(self):
        # Train batch is [[state,action,reward,next_state,done],...]
        train_batch = self.experience_replay.sample(self.batch_size)

        # Separate the batch into a numpy array for each component
        train_state = np.array([x[0] for x in train_batch])
        train_action = np.array([x[1] for x in train_batch])
        train_reward = np.array([x[2] for x in train_batch])
        train_next_state = np.array([x[3] for x in train_batch])
        train_done = np.array([x[4] for x in train_batch])

        # Generate target Q
        target_q = self.generate_target_q(
            train_state=train_state,
            train_action=train_action,
            train_reward=train_reward,
            train_next_state=train_next_state,
            train_done=train_done
        )

        # Train the main model
        loss = self.main_qn.model.train_on_batch(train_state, target_q)
        return loss

    def train(self):

        # Make the networks equal
        self.update_target_graph()

        # We'll begin by acting completely randomly. As we gain experience and improve,
        # we will begin reducing the probability of acting randomly, and instead
        # take the actions that our Q network suggests
        prob_random = self.prob_random_start
        prob_random_drop = (self.prob_random_start - self.prob_random_end) / self.annealing_steps

        # Init variable
        num_steps = []  # Tracks number of steps per episode
        rewards = []  # Tracks rewards per episode
        print_every = 50  # How often to print status
        losses = [0]  # Tracking training losses
        num_episode = 0

        while True:
            # Run one episode
            experiences_episode = self.run_one_episode(num_episode, prob_random)

            # Save the episode in the replay buffer
            self.experience_replay.add(experiences_episode)

            # If we have played enough episodes, start training
            if num_episode > self.min_pre_train_episodes:

                # Decrease the probability of a random action until we reach prob_random_end
                if prob_random > self.prob_random_end:
                    prob_random -= prob_random_drop

                # Every train_frequency iteration, train the model
                if num_episode % self.train_frequency == 0:
                    for num_epoch in range(self.num_epochs):
                        loss = self.train_one_step()
                        losses.append(loss)

                    # Update the target model with values from the main model
                    self.update_target_graph()

            # Increment the episode
            num_episode += 1
            num_steps.append(len(experiences_episode))
            rewards.append(sum([e[2] for e in experiences_episode]))

            # Print Info
            if num_episode % print_every == 0:
                # datetime object containing current date and time
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))

            # Stop Condition
            if np.mean(rewards[-print_every:]) >= self.goal:
                now = datetime.now()
                dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
                mean_loss = np.mean(losses[-(print_every * self.num_epochs):])
                print("{} - Num episode: {} Mean reward: {:0.4f} Prob random: {:0.4f}, Loss: {:0.04f}".format(
                    dt_string, num_episode, np.mean(rewards[-print_every:]), prob_random, mean_loss))
                print("Training complete because we reached goal rewards.")
                break
            if num_episode > self.max_num_episodes:
                print("Training Stop because we reached max num of episodes")
                break
Example #8
class DQN_agent():
    def __init__(self):
        self.eps = 0.1
        self.env = GridEnv(3)
        self.batch_size = 20

        if prioritized_replay and replay_type == "proportional":
            self.replay = ProportionalReplay(max_buffer_size,
                                             prioritized_replay_alpha)
        elif prioritized_replay and replay_type == "ranked":
            N_list = [self.batch_size] + [
                int(x) for x in np.linspace(100, max_buffer_size, 5)
            ]
            save_quantiles(N_list=N_list,
                           k=self.batch_size,
                           alpha=prioritized_replay_alpha)
            self.replay = RankBasedReplay(max_buffer_size,
                                          prioritized_replay_alpha)
        else:
            self.replay = ExperienceReplay(
                max_buffer_size)  # passing size of buffer

        # define graph
        self.inputs = tf.placeholder(tf.float32,
                                     shape=(None, self.env.state_size))
        self.target_values = tf.placeholder(tf.float32, shape=(None, ))
        self.actions = tf.placeholder(tf.int32, shape=(None, ))
        self.is_weights = tf.placeholder(tf.float32, shape=(
            None, ))  # importance sampling weights for prioritized replay
        self.Q_out_op, self.Q_update_op, self.td_error_op = self.build_graph(
        )  # build main network
        self.target_Q_out_op, _, _ = self.build_graph(
            'target')  # build identical target network

        self.init_op = tf.global_variables_initializer()
        self.sess = tf.Session()

    def build_graph(self, scope='main'):
        with tf.variable_scope(scope):
            h = tf.layers.dense(self.inputs,
                                16,
                                activation=tf.nn.relu,
                                name="h")
            outputs = tf.layers.dense(h,
                                      self.env.num_actions,
                                      activation=tf.nn.softmax,
                                      name="outputs")

            # everything is now the same shape (batch_size, num_actions)
            # nonzero error only for selected actions
            action_mask = tf.one_hot(self.actions,
                                     self.env.num_actions,
                                     on_value=True,
                                     off_value=False)
            targets = tf.tile(tf.expand_dims(self.target_values, 1),
                              [1, self.env.num_actions])
            target_outputs = tf.where(
                action_mask, targets, outputs
            )  # takes target value where mask is true. takes outputs value otherwise

            td_error = target_outputs - outputs  # only one element in each row is non-zero
            weights = tf.tile(tf.expand_dims(self.is_weights, 1),
                              [1, self.env.num_actions
                               ])  # all 1s when not using priority replay
            weighted_td_error = weights * td_error  # element-wise multiplication

            loss = tf.reduce_sum(tf.square(weighted_td_error))
            update = tf.train.AdamOptimizer().minimize(loss)
        return outputs, update, td_error

    def train(self):
        steps_per_ep = np.zeros(episodes)
        for episode in range(episodes):
            print(episode)
            self.env.reset()
            state = self.env.state
            done = False
            num_steps = 0
            while not done:
                num_steps += 1
                action = self.get_eps_action(state, self.eps)
                next_state, reward, done, _ = self.env.step(action)
                self.replay.add((state, action, reward, next_state,
                                 done))  # store in experience replay

                # sample from experience replay
                if prioritized_replay:
                    beta = beta0 + episode * (
                        1 - beta0
                    ) / episodes  # linear annealing schedule for IS weights
                    states, actions, rewards, next_states, dones, weights, indices = self.replay.sample(
                        self.batch_size, beta)
                    self.net_update(states, actions, rewards, next_states,
                                    dones, weights, indices)  # qlearning
                else:
                    states, actions, rewards, next_states, dones = self.replay.sample(
                        self.batch_size)
                    self.net_update(states, actions, rewards, next_states,
                                    dones)  # qlearning

                # slowly update target network
                if num_steps % update_every == 0:
                    self.target_net_update()

                # sort max heap periodically
                if num_steps % sort_every == 0:
                    if prioritized_replay and replay_type == "ranked":
                        self.replay.sort()

                state = next_state
            steps_per_ep[episode] = num_steps
        return steps_per_ep

    # from https://tomaxent.com/2017/07/09/Using-Tensorflow-and-Deep-Q-Network-Double-DQN-to-Play-Breakout/
    def target_net_update(self):
        # get sorted lists of parameters in each of the networks
        main_params = [
            t for t in tf.trainable_variables() if t.name.startswith("main")
        ]
        main_params = sorted(main_params, key=lambda v: v.name)
        target_params = [
            t for t in tf.trainable_variables() if t.name.startswith("target")
        ]
        target_params = sorted(target_params, key=lambda v: v.name)

        update_ops = []
        for main_v, target_v in zip(main_params, target_params):
            op = target_v.assign(main_v)
            update_ops.append(op)

        self.sess.run(update_ops)

    # minibatch qlearning
    def net_update(self,
                   states,
                   actions,
                   rewards,
                   next_states,
                   dones,
                   weights=None,
                   indices=None):
        not_dones = np.logical_not(dones)

        # create a shape (batch_size, ) array of target values
        target_values = rewards.astype(
            float)  # np.array of shape (batch_size, )
        next_inputs = next_states[
            not_dones]  # np.array of shape (#not_done, state_size)
        next_Qs = self.sess.run(self.Q_out_op,
                                {self.inputs: next_inputs
                                 })  # np.array of shape (#not_done, num_actions)
        max_Qs = np.max(next_Qs, axis=1)  # np.array of shape (#not_done,)
        target_values[not_dones] += gamma * max_Qs

        # if not using prioritized replay
        if weights is None:
            weights = np.ones(self.batch_size)

        # compute gradients and update parameters
        _, td_error = self.sess.run([self.Q_update_op, self.td_error_op], \
                {self.inputs: states, self.target_values: target_values, self.actions: actions, self.is_weights: weights})

        # update priority replay priorities
        if indices is not None:
            td_error = td_error.ravel()[np.flatnonzero(
                td_error)]  # shape (batch_size, )
            self.replay.update_priorities(
                indices,
                np.abs(td_error) + 1e-3
            )  # add small number to prevent never sampling 0 error transitions

    # returns eps-greedy action with respect to Q
    def get_eps_action(self, state, eps):
        if self.env.np_random.uniform() < eps:
            action = self.env.sample()
        else:
            Q = self.sess.run(self.Q_out_op, {self.inputs: np.array([state])})
            max_actions = np.where(np.ravel(Q) == Q.max())[0]
            action = self.env.np_random.choice(
                max_actions)  # to select argmax randomly
        return action
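For reference, proportional prioritized replay (the ProportionalReplay used above, with prioritized_replay_alpha and the beta annealed in train) samples transition i with probability proportional to its priority raised to alpha, and corrects the resulting bias with importance-sampling weights scaled by beta. The standalone sketch below shows only that sampling rule; it is not the ProportionalReplay class itself.

import numpy as np


def sample_indices_and_weights(priorities, batch_size, alpha, beta):
    """Proportional prioritized sampling: P(i) ~ p_i**alpha, w_i = (N * P(i))**(-beta)."""
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()
    indices = np.random.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[indices]) ** (-beta)
    weights /= weights.max()  # normalize so the largest weight is 1
    return indices, weights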
Example #9
def end_to_end_training(
        epochs: int,
        model_cls: BaseConditionalGenerationOracle,
        optimizer_cls: BaseOptimizer,
        optimized_function_cls: BaseConditionalGenerationOracle,
        logger: BaseLogger,
        model_config: dict,
        optimizer_config: dict,
        n_samples_per_dim: int,
        step_data_gen: float,
        n_samples: int,
        current_psi: Union[List[float], torch.tensor],
        reuse_optimizer: bool = False,
        reuse_model: bool = False,
        shift_model: bool = False,
        finetune_model: bool = False,
        use_experience_replay: bool = True,
        add_box_constraints: bool = False,
        experiment=None,
        scale_psi=False):
    """

    :param epochs: int
        number of local training steps to perform
    :param model_cls: BaseConditionalGenerationOracle
        model that is able to generate samples and calculate loss function
    :param optimizer_cls: BaseOptimizer
    :param logger: BaseLogger
    :param model_config: dict
    :param optimizer_config: dict
    :param n_samples_per_dim: int
    :param step_data_gen: float
    :param n_samples: int
    :param current_psi:
    :param reuse_model:
    :param reuse_optimizer:
    :param finetune_model:
    :param shift_model:

    :return:
    """
    gan_logger = GANLogger(experiment)
    # gan_logger = RegressionLogger(experiment)
    # gan_logger = None

    y_sampler = optimized_function_cls(device=device, psi_init=current_psi)
    model = model_cls(y_model=y_sampler, **model_config,
                      logger=gan_logger).to(device)

    optimizer = optimizer_cls(oracle=model, x=current_psi, **optimizer_config)
    print(model_config)
    exp_replay = ExperienceReplay(psi_dim=model_config['psi_dim'],
                                  y_dim=model_config['y_dim'],
                                  x_dim=model_config['x_dim'],
                                  device=device)
    weights = None

    logger.log_performance(y_sampler=y_sampler,
                           current_psi=current_psi,
                           n_samples=n_samples)
    for epoch in range(epochs):
        # generate new data sample
        x, condition = y_sampler.generate_local_data_lhs(
            n_samples_per_dim=n_samples_per_dim,
            step=step_data_gen,
            current_psi=current_psi,
            n_samples=n_samples)
        if x is None and condition is None:
            print("Empty training set, continue")
            continue
        x_exp_replay, condition_exp_replay = exp_replay.extract(
            psi=current_psi, step=step_data_gen)
        exp_replay.add(y=x, condition=condition)
        x = torch.cat([x, x_exp_replay], dim=0)
        condition = torch.cat([condition, condition_exp_replay], dim=0)
        used_samples = n_samples

        # breaking things
        if model_config.get("predict_risk", False):
            condition = condition[::n_samples_per_dim, :current_psi.shape[0]]
            x = y_sampler.func(condition,
                               num_repetitions=n_samples_per_dim).reshape(
                                   -1, x.shape[1])
        print(x.shape, condition.shape)
        ## Scale train set
        if scale_psi:
            scale_factor = 10
            feature_max = condition[:, :model_config['psi_dim']].max(axis=0)[0]
            y_sampler.scale_factor = scale_factor
            y_sampler.feature_max = feature_max
            y_sampler.scale_psi = True
            print("MAX FEATURES", feature_max)
            condition[:, :
                      model_config['psi_dim']] /= feature_max * scale_factor
            current_psi = current_psi / feature_max * scale_factor
            print(feature_max.shape, current_psi.shape)
            print("MAX PSI", current_psi)

        model.train()
        if reuse_model:
            if shift_model:
                if isinstance(model, ShiftedOracle):
                    model.set_shift(current_psi.clone().detach())
                else:
                    model = ShiftedOracle(oracle=model,
                                          shift=current_psi.clone().detach())
                model.fit(x, condition=condition, weights=weights)
            else:
                model.fit(x, condition=condition, weights=weights)
        else:
            # if not reusing model
            # then at each epoch re-initialize and re-fit
            model = model_cls(y_model=y_sampler,
                              **model_config,
                              logger=gan_logger).to(device)
            print("y_shape: {}, cond: {}".format(x.shape, condition.shape))
            model.fit(x, condition=condition, weights=weights)

        model.eval()

        if reuse_optimizer:
            optimizer.update(oracle=model, x=current_psi)
        else:
            # find new psi
            optimizer = optimizer_cls(oracle=model,
                                      x=current_psi,
                                      **optimizer_config)

        if add_box_constraints:
            box_barriers = make_box_barriers(current_psi, step_data_gen)
            add_barriers_to_oracle(oracle=model, barriers=box_barriers)

        previous_psi = current_psi.clone()
        current_psi, status, history = optimizer.optimize()
        if scale_psi:
            current_psi, status, history = optimizer.optimize()
            current_psi = current_psi / scale_factor * feature_max
            y_sampler.scale_psi = False
            print("NEW_PSI: ", current_psi)

        try:
            # logging optimization, i.e. statistics of psi
            logger.log_grads(model,
                             y_sampler,
                             current_psi,
                             n_samples_per_dim,
                             log_grad_diff=False)
            logger.log_optimizer(optimizer)
            logger.log_performance(y_sampler=y_sampler,
                                   current_psi=current_psi,
                                   n_samples=n_samples)
            experiment.log_metric("used_samples_per_step", used_samples)
            experiment.log_metric("sample_size", len(x))

        except Exception as e:
            print(e)
            print(traceback.format_exc())
            # raise
        torch.cuda.empty_cache()
    logger.func_saver.join()
    return